#! python3
# phageDisplayElisaAnalysis.py - Analyse ELISA results along with corresponding sequence data. Calculates the average of duplicates for each protein and normalizes them against the average of the negative controls/blanks. ELISA results that don't have corresponding sequencing results are removed from the final results.

####################################
#    Preamble
####################################

# Usage notes:
# * This code is dependent on the style of the worksheet used as the ELISA data source. This will be entirely based upon the output from the "phageDisplayELISA384welly" export format used with the BioTek plate reader.
# * Any assumptions that were made from previous code will be retained, i.e. if the data source is the output from "phageDisplaySeqAnalysis.py" then all alignments will exclude sequences that weren't full length and those that have premature stop codons.
# * Accepted plate format:
#     1   2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24
#   ----------------------------------------------------------------------------------------
# A | 01 01 02 02 03 03 04 04 05  05  06  06  07  07  08  08  09  09  10  10  11  11  12  12 
# B | <____________________________________empty___________________________________________>
# C | 13 13 14 14 15 15 16 16 17  17  18  18  19  19  20  20  21  21  22  22  23  23  24  24
# D | <____________________________________empty___________________________________________>
# E | 25 25 26 26 27 27 28 28 29  29  30  30  31  31  32  32  33  33  34  34  35  35  36  36
# F | <____________________________________empty___________________________________________>
# G | 37 37 38 38 39 39 40 40 41  41  42  42  43  43  44  44  45  45  46  46  47  47  48  48
# H | <____________________________________empty___________________________________________>
# I | 49 49 50 50 51 51 52 52 53  53  54  54  55  55  56  56  57  57  58  58  59  59  60  60
# J | <____________________________________empty___________________________________________>
# K | 61 61 62 62 63 63 64 64 65  65  66  66  67  67  68  68  69  69  70  70  71  71  72  72
# L | <____________________________________empty___________________________________________>
# M | 73 73 74 74 75 75 76 76 77  77  78  78  79  79  80  80  81  81  82  82  83  83  84  84
# N | <____________________________________empty___________________________________________>
# O | 85 85 86 86 87 87 88 88 89  89  90  90  91  91  92  92  93  93  94  94  95  95  96  96
# P | <___________________________________BLANKS___________________________________________>

# Compatibility notes:
# * Advised to use Biopython 1.77 and no later version. If using a more current version, change alignment codes from "alignment.format(format_spec)" to "format(alignment, format_spec)".
# * If using Spyder as your IDE, use a version that isn't version 5. This version for some reason has conflicts with the xlsxwriter package and won't get past importing modules.
# * This code is confirmed to work with python 3.8.8. Later versions may work but have not been verified.
# * Confirmed to work in Windows, unconfirmed in Macs and Linux but should work in theory (may need to change lines regarding path names so that the format matches the OS, currently these are optimised for Windows' format).

# To do:
# * Add work-arounds for when code breaks; have it bypass the issue and finish analysis, leaving the user to manually analyse problem sequences.
# * To facilitate the above point, probably add functions that do nt-aa conversion, trimming, etc. outside of the program.
# * Add workaround for overflow cells (e.g. find OVRFLW, replace with 4, continue with code).
# * Add custom warnings for common errors that break script, give suggestions as to what went wrong.
# * Make code to read sequences from excel file without need for fasta file.

####################################
#    Modules
####################################

import os, re, logging, xlsxwriter, pandas, statistics
from Bio import AlignIO
from collections import Counter, OrderedDict

####################################
#    Functions
####################################

# Create an average of a list.
def aveList(list):
    """Return the arithmetic mean of the values in ``list``.

    Raises ZeroDivisionError when ``list`` is empty.
    """
    total = sum(list)
    count = len(list)
    return total / count
            
####################################
#    Classes
####################################

# Ordered list of counts.
class OrderedCounter(Counter, OrderedDict):
    """Counter combined with OrderedDict through multiple inheritance; adds no members of its own."""

####################################
#    Code
####################################

##################
# Colours for print functions.
##################
def cyan(text):
    """Return ``text`` wrapped in the '0;36' colour escape sequence and a reset code."""
    return '\033[0;36m' + text + '\033[0m'


def green(text):
    """Return ``text`` wrapped in the '0;32' colour escape sequence and a reset code."""
    return '\033[0;32m' + text + '\033[0m'

##################
# Setup
##################

# Change working directory.
# Ask for the folder holding the input files; the same folder receives the log and the output workbook.
print(green('\nScript started. This will pair sequencing data with their corresponding ELISA results for comparison.') +
      cyan('''\n\nEnter folder location/path where files are located:

This will also be the location for the final output.'''))
# Drop whitespace that comes along with a pasted path, then normalise Windows separators to '/'.
path = input().strip()
path = path.replace('\\', '/')
os.chdir(path)

# Choose ELISA data source.
print(cyan('''\nEnter the raw ELISA data file name:

Must be in .xlsx format. Include the file extension in the name.'''))
elisaFile = input()
extensionRegex = re.compile(r'([.].*)')
# Everything before the first '.' becomes the stem of the log and output file names.
elisaFileShort = elisaFile.partition('.')[0]
elisaFilePath = '/'.join((path, elisaFile))

# Logging setup.
# The log file sits next to the input data and is overwritten on every run (filemode 'w').
logFilePath = path + "/" + elisaFileShort + ".log"
logging.basicConfig(filename=logFilePath,
                    filemode='w',
                    format='%(asctime)s - %(message)s',
                    level=logging.INFO)
logging.info('Working directory changed to %s.', path)
logging.info('%s chosen as ELISA data source.', elisaFile)

# Choose sequence file data sources and corresponding alignment files for ELISA data.
# Amino acid alignment file.
print(cyan('''\nEnter amino acid alignment file name:

Must be in .fasta format. Include the file extension in the name.'''))
seqFile = input()
seqFilePath = path + '/' + seqFile
logging.info('%s chosen as amino acid sequence data source.', seqFilePath)

# Nucleotide alignment file.
print(cyan('''\nEnter nucleotide alignment file name:

Must be in .fasta format. Include the file extension in the name.'''))
seqFile2 = input()
seqFilePath2 = path + '/' + seqFile2
# The log entry only records the chosen file; the prompt text that had been pasted into it is removed.
logging.info('%s chosen as nucleotide sequence data source.', seqFilePath2)

##################
# Data Analysis
##################

# Patterns used to pull names and sequences out of the raw FASTA text.
nameRegex = re.compile(r'>(.*)')              # header line content after '>'
seqRegex = re.compile(r'([A-Z]{10,})')        # runs of ten or more capital letters
stopRegex = re.compile(r'([*]+[A-Z]*)')       # '*' characters plus any capitals directly after them
shortNameRegex = re.compile(r'([_][M][\w]*)') # '_M...' suffix removed from names


def _readNamesAndSeqs(fastaPath):
    """Return (names, sequences) extracted from the FASTA file at ``fastaPath``.

    Names are the header lines with the '_M...' suffix removed. Sequences are
    found after deleting all newlines and everything matched by ``stopRegex``.
    """
    with open(fastaPath, 'r') as inFile:
        rawText = inFile.read()
    names = nameRegex.findall(shortNameRegex.sub('', rawText))
    # NOTE(review): newlines are removed before matching, so capital letters at the
    # end of a header line sit directly against the first residues of its sequence.
    # Confirm the header format cannot contribute letters to a sequence.
    flatText = stopRegex.sub('', rawText.replace('\n', ''))
    return names, seqRegex.findall(flatText)


# Extract amino acid sequence names and sequences.
nameListAa, aaList = _readNamesAndSeqs(seqFilePath)
logging.info('Amino acid sequence names read from %s.', seqFile)
logging.info('Amino acid sequences read from %s.', seqFile)
# Extract nucleotide sequence names and sequences.
nameListNt, ntList = _readNamesAndSeqs(seqFilePath2)
logging.info('Nucleotide sequence names read from %s.', seqFile2)
logging.info('Nucleotide sequences read from %s.', seqFile2)
# Associate names with corresponding sequences (pairing is purely positional).
aaDict = {name: seq for name, seq in zip(nameListAa, aaList)}
ntDict = {name: seq for name, seq in zip(nameListNt, ntList)}

# Read the existing amino acid alignment and record its length (number of columns).
alignment = AlignIO.read(seqFilePath, "fasta")
alignLen = alignment.get_alignment_length()
logging.info('Amino acid alignment length calculated.')

# Read the existing nucleotide alignment and record its length (number of columns).
alignment2 = AlignIO.read(seqFilePath2, "fasta")
alignLen2 = alignment2.get_alignment_length()
logging.info('Nucleotide alignment length calculated.')

# Read ELISA data.
# Worksheet row 0 is used as the header, worksheet rows 1-45 are skipped and only
# worksheet columns 2-25 (24 columns) are loaded.
allCells = pandas.read_excel(elisaFilePath, skiprows = list(range(1, 46)), usecols = range(2, 26))
logging.info('Raw data read from ELISA file.')
# Extract values for negative controls/blanks: positional row 15, keyed 'P1'..'P24'.
# .iloc is used for both axes so the lookup is positional whatever the column labels are.
blankRow = allCells.iloc[15]
blankCells = {'P%d' % (col + 1): blankRow.iloc[col] for col in range(24)}
print(cyan('''\nEnter wells containing ELISA blanks:

Not case-sensitive; separate with commas (no spaces) if more than one (e.g. "p22,p23,p24").'''))
# Tolerate stray spaces around the commas.
blanks = [well.strip() for well in input().upper().split(',')]
logging.info('"%s" used as negative controls/blanks.' % (blanks))
# Create list of ELISA absorbances for user-inputted blank IDs.
unknownBlanks = [well for well in blanks if well not in blankCells]
if unknownBlanks:
    logging.warning('Blank well(s) %s not recognised and ignored.', unknownBlanks)
blankValues = [blankCells[well] for well in blanks if well in blankCells]
# aveList already handles a single blank; only an empty list cannot be averaged,
# so fail with an explanation instead of an unrelated TypeError.
if not blankValues:
    raise ValueError('No valid blank wells were entered; expected IDs from P1 to P24 (e.g. "p22,p23,p24").')
blankAve = aveList(blankValues)
logging.info('Blank values retrieved and averaged.')
    
# Create list of all ELISA absorbances.
# Samples are read from every second worksheet row (positional rows 0, 2, ..., 14);
# each sample occupies two adjacent wells, so values are averaged in pairs.
cellValues = [value for row in range(0, 16, 2) for value in allCells.iloc[row]]
cellAve = list()
for i in range(0, len(cellValues) - 1, 2):
    duplicates = []
    for value in cellValues[i:i + 2]:
        # Overflow readings arrive as text; substitute 4, the workaround prescribed in
        # this script's notes, so the well keeps its position in the list.
        if isinstance(value, str) and value.strip().upper() in ('OVRFLW', 'OVFLW'):
            logging.warning('Overflow reading for sample %d replaced with 4.', i // 2 + 1)
            value = 4
        duplicates.append(value)
    # A pair must never be skipped: cellAve is indexed by well position later on, and
    # dropping one entry would shift every following sample onto the wrong well.
    try:
        cellAve.append(statistics.mean(duplicates))
    except (TypeError, ValueError) as error:
        raise ValueError('Sample %d contains a non-numeric ELISA reading: %r.' % (i // 2 + 1, duplicates)) from error
logging.info('Result wells extracted from raw data.')
        
# Normalise ELISA scores to the blank average: each averaged sample is divided by blankAve.
relAveList = [sampleAve / blankAve for sampleAve in cellAve]
logging.info('Result averaged relative to average of blanks (or single blank if less than two blanks).')
    
# Remove ELISA data that don't have amino acid sequencing counterparts.
# Extract IDs present in sequencing data (every '[A-H]<digits>' match in each name, in file order).
seqPlateRegex = re.compile(r'([A-H]\d+)')
plateIDs = [seqPlateRegex.findall(name) for name in nameListAa]
# Turn list of lists into a flat list.
seqPlateIDs = [wellID for sublist in plateIDs for wellID in sublist]
# Create list of all ELISA IDs ('A01'..'H12') and assign corresponding numbers (0-95) for indexing.
elisaPlateIDs = ['%s%02d' % (row, col) for row in 'ABCDEFGH' for col in range(1, 13)]
wellListConversion = {wellID: index for index, wellID in enumerate(elisaPlateIDs)}
# Keep only the ELISA absorbances whose well IDs occur in the sequencing data.
reducedrelAveListAa = []
for ID in seqPlateIDs:
    # Sequence names may carry unpadded IDs ('A1'); the lookup table is zero-padded ('A01').
    paddedID = ID[0] + ID[1:].zfill(2)
    well = wellListConversion.get(paddedID)
    if well is None:
        raise ValueError('Well ID "%s" from the amino acid sequence names is not a valid 96-well position (A01-H12).' % (ID))
    reducedrelAveListAa.append(relAveList[well])
logging.info('ELISA results without corresponding sequencing results removed.')

# Remove ELISA data that don't have nucleotide sequencing counterparts.
# Extract IDs present in sequencing data (every '[A-H]<digits>' match in each name, in file order).
plateIDs = [seqPlateRegex.findall(name) for name in nameListNt]
# Turn list of lists into a flat list.
seqPlateIDs = [wellID for sublist in plateIDs for wellID in sublist]
# Keep only the ELISA absorbances whose well IDs occur in the sequencing data.
reducedRelAveListNt = []
for ID in seqPlateIDs:
    # Sequence names may carry unpadded IDs ('A1'); the lookup table is zero-padded ('A01').
    paddedID = ID[0] + ID[1:].zfill(2)
    well = wellListConversion.get(paddedID)
    if well is None:
        raise ValueError('Well ID "%s" from the nucleotide sequence names is not a valid 96-well position (A01-H12).' % (ID))
    reducedRelAveListNt.append(relAveList[well])
logging.info('ELISA results without corresponding sequencing results removed.')

# Relate IDs to their respective ELISA absorbances (amino acids and nucleotides).
# Names ending in a single-digit well ID ('...A1') are zero-padded ('...A01').
newNameRegex = re.compile(r'([A-H])(\d{1}$)')


def _sortedTogether(keys, *parallelLists):
    """Sort ``keys`` and return it followed by each parallel list reordered the same way."""
    order = sorted(range(len(keys)), key=keys.__getitem__)
    return [[column[i] for i in order] for column in (keys,) + parallelLists]


# Names, sequences and scores are paired purely by position, so they must be sorted
# together; sorting the names on their own would attach them to the wrong sequence/score.
newNameListAa = [newNameRegex.sub(r'\g<1>0\g<2>', name) for name in nameListAa]
if len(newNameListAa) == len(aaList) == len(reducedrelAveListAa):
    newNameListAa, nameListAa, aaList, reducedrelAveListAa = _sortedTogether(
        newNameListAa, nameListAa, aaList, reducedrelAveListAa)
else:
    logging.warning('Amino acid names, sequences and ELISA scores differ in number; names left in file order.')

newNameListNt = [newNameRegex.sub(r'\g<1>0\g<2>', name) for name in nameListNt]
if len(newNameListNt) == len(ntList) == len(reducedRelAveListNt):
    newNameListNt, nameListNt, ntList, reducedRelAveListNt = _sortedTogether(
        newNameListNt, nameListNt, ntList, reducedRelAveListNt)
else:
    logging.warning('Nucleotide names, sequences and ELISA scores differ in number; names left in file order.')

# Create dictionaries with new reorganised names/sequences.
newaaDict = dict(zip(newNameListAa, aaList))
newntDict = dict(zip(newNameListNt, ntList))
aveCellsDictAa = dict(zip(newNameListAa, reducedrelAveListAa))
aveCellsDictNt = dict(zip(newNameListNt, reducedRelAveListNt))

##################
# Export as .xlsx.
##################

# Create workbook.
workbook = xlsxwriter.Workbook(path + '/' + elisaFileShort + '_analysed.xlsx')
# Log the name the workbook is actually saved under (with the '_analysed' suffix).
logging.info('Excel spreadsheet created as "%s_analysed.xlsx".', elisaFileShort)

# Cell formatting rules.
# General: centred horizontally and vertically.
general_format = workbook.add_format({'align': 'center', 'valign': 'vcenter'})
# Titles.
title_format = workbook.add_format({'bold': True, 'font_size': 12, 'align': 'center'})
# Statistics: one decimal place with thousands separator.
stats_format = workbook.add_format({'num_format': '#,##0.0', 'align': 'center', 'valign': 'vcenter'})
# Wells.
wellList_format = workbook.add_format({'font_size': 11})
wellID_format = workbook.add_format({'font_size': 12, 'align': 'center'})
# Residue numbers.
residue_format = workbook.add_format({'font_size': 10, 'align': 'center'})
# Sequences: one letter per cell.
sequence_format = workbook.add_format({'font_size': 10,
                                       'align': 'center',
                                       'valign': 'vcenter',
                                       'font_name': 'Lucida Console'})
logging.info('Cell formatting rules are set.')

##################
# 'All AA Seq' worksheet.
##################

# Create worksheet for all amino acid sequences.
worksheet1 = workbook.add_worksheet('All AA Seq')
worksheet1.hide_gridlines(option=2)
worksheet1.set_column(0, 0, 15)
worksheet1.set_column(1, alignLen, 2)
worksheet1.freeze_panes(0, 1)
logging.info('All AA Seq worksheet created.')
# Column 0: well IDs, one per row from row 2 down.
worksheet1.write(0, 0, 'ID', title_format)
for idRow, name in enumerate(newNameListAa, start=2):
    worksheet1.write(idRow, 0, name, wellID_format)
    logging.info('"%s" written to All AA Seq worksheet.', name)
# Columns 1 onwards: amino acid sequences, one residue per cell.
worksheet1.write(0, 6, 'Amino Acid Sequence', title_format)
for seqRow, aa in enumerate(aaList, start=2):
    for seqCol, letter in enumerate(aa, start=1):
        worksheet1.write(seqRow, seqCol, letter, sequence_format)
logging.info('All amino acid sequences written to All AA Seq worksheet.')
# Column after the alignment: normalised ELISA absorbances.
absCol = alignLen + 1
worksheet1.write(0, absCol, 'Normalized Absorbance', title_format)
for absRow, result in enumerate(reducedrelAveListAa, start=2):
    worksheet1.write(absRow, absCol, result, stats_format)
    logging.info('"%s" written to All AA Seq worksheet.', result)
# Row 1: amino acid residue numbers above the sequences.
numberList = list(range(1, alignLen + 1))
for residueCol, number in enumerate(numberList, start=1):
    worksheet1.write(1, residueCol, number, residue_format)

##################
# 'Unique AA Seq' worksheet.
##################

# Create worksheet for unique amino acid sequences.
worksheet2 = workbook.add_worksheet('Unique AA Seq')
worksheet2.hide_gridlines(option=2)
worksheet2.set_column(1, alignLen, 2)
worksheet2.set_column(alignLen + 2, alignLen + 5, 8)
worksheet2.freeze_panes(0, 1)
logging.info('Unique AA Seq worksheet created.')
# Unique amino acid sequences with their counts, most frequent first.
unique = OrderedCounter(aaList).most_common()
uniqueDict = dict(unique)
# Columns 1 onwards: unique sequences, one residue per cell.
worksheet2.write(0, 6, 'Amino Acid Sequence', title_format)
for uniqueRow, seq in enumerate(uniqueDict, start=2):
    for uniqueCol, letter in enumerate(seq, start=1):
        worksheet2.write(uniqueRow, uniqueCol, letter, sequence_format)
logging.info('Unique sequences written to Unique AA Seq worksheet.')
# Column after the alignment: number of occurrences of each unique sequence.
countCol = alignLen + 1
worksheet2.write(0, countCol, 'Count', title_format)
for countRow, number in enumerate(uniqueDict.values(), start=2):
    worksheet2.write_number(countRow, countCol, number, general_format)
logging.info('Sequence counts written to Unique AA Seq worksheet.')
# Row 1: amino acid residue numbers above the sequences.
for residueCol, number in enumerate(numberList, start=1):
    worksheet2.write(1, residueCol, number, residue_format)

##################
# Ordered values: Amino Acids
##################

# Get ordered list of wells that correspond to unique sequences; necessary for subsequent statistics.
orderedSeq = list(uniqueDict)
# A well belongs to a unique sequence only when its sequence is identical. A substring
# test would also assign wells whose longer sequence merely contains the shorter one.
orderedIndex = [wellName
                for seq in orderedSeq
                for wellName, wellSeq in newaaDict.items()
                if wellSeq == seq]
# Get ordered values corresponding to ordered list of wells for unique sequences; necessary for subsequent statistics.
# The leading zero is a placeholder so that the statistics can start slicing at index 1.
orderedScores = [0] + [aveCellsDictAa[wellName] for wellName in orderedIndex if wellName in aveCellsDictAa]

##################
# Statistics: Amino Acids
##################
# Notes: (applies to both amino acid and nucleotide sequence statistics)
# * Statistics will encounter an error if "overflow" cells are in the original ELISA file, this is reflected in the length of cellAve being less than the total number of sequences. Replace "OVFLW" wells with "4".
# * If you encounter an error involving index values being out of range, this is because the sum of all sequence counts in 'uniqueDict' does not match the total in 'orderedScores'. Check the original sequencing file to make sure sequences aren't repeated.

# Retrieve max, min, median, mean and standard deviation of the absorbances for each
# unique amino acid sequence. orderedScores holds a placeholder at index 0 followed by
# the scores grouped per unique sequence, so each group is the slice [begin:end].
uniqueMaxList = []
uniqueMinList = []
uniqueMedianList = []
uniqueMeanList = []
uniqueStdevList = []
begin = 1
for seq, count in uniqueDict.items():
    end = begin + int(count)
    groupScores = orderedScores[begin:end]
    if not groupScores:
        # Happens when the sequence counts add up to more than the number of scored wells.
        raise ValueError('No ELISA scores found for a unique amino acid sequence; '
                         'sequence counts do not match the scored wells. '
                         'Check the sequencing file for repeated sequence names.')
    uniqueMaxList.append(max(groupScores))
    uniqueMinList.append(min(groupScores))
    uniqueMedianList.append(statistics.median(groupScores))
    uniqueMeanList.append(statistics.mean(groupScores))
    # The standard deviation needs at least two values; report 0 for a single well.
    try:
        uniqueStdevList.append(statistics.stdev(groupScores))
    except statistics.StatisticsError:
        uniqueStdevList.append(0)
    begin = end

# Write statistics to Unique AA Seq worksheet.
# One column per statistic, starting two columns after the alignment; titles in row 0, values from row 2.
statsColumns = [('Max.', uniqueMaxList),
                ('Min.', uniqueMinList),
                ('Median', uniqueMedianList),
                ('Mean', uniqueMeanList),
                ('St. Dev.', uniqueStdevList)]
for statsCol, (statsTitle, statsValues) in enumerate(statsColumns, start=alignLen + 2):
    worksheet2.write(0, statsCol, statsTitle, title_format)
    for statsRow, statsValue in enumerate(statsValues, start=2):
        worksheet2.write(statsRow, statsCol, statsValue, stats_format)

# Associate specific well IDs with corresponding unique sequence (amino acids).
# orderedIndex is grouped per unique sequence, so consecutive slices of length `count` give each group.
countID = []
begin = 0
for count in uniqueDict.values():
    end = begin + int(count)
    countID.append(orderedIndex[begin:end])
    begin = end
# Write IDs to worksheet as a comma-separated list per unique sequence.
wellCol = alignLen + 7
worksheet2.write(0, wellCol, 'Wells', title_format)
countIDregex = re.compile(r"([A-Z][0-1][0-9])")
sep = ', '
for wellRow, groupNames in enumerate(countID, start=2):
    groupWells = countIDregex.findall(str(groupNames))
    worksheet2.write(wellRow, wellCol, sep.join(groupWells), wellList_format)

# Assign arbitrary IDs (1, 2, 3, ...) to each unique amino acid sequence.
worksheet2.write(0, 0, 'ID', title_format)
numberList = list(range(1, len(uniqueDict) + 1))
for idRow, number in enumerate(numberList, start=2):
    worksheet2.write_number(idRow, 0, number, general_format)

##################
# 'All NT Seq' worksheet.
##################

# Create worksheet for all nucleotide sequences.
worksheet3 = workbook.add_worksheet('All NT Seq')
worksheet3.hide_gridlines(option=2)
worksheet3.set_column(0, 0, 15)
worksheet3.set_column(1, alignLen2, 3)
worksheet3.freeze_panes(0, 1)
logging.info('All NT Seq worksheet created.')
# Column 0: sequence names, one per row from row 2 down.
worksheet3.write(0, 0, 'ID', title_format)
for idRow, name in enumerate(nameListNt, start=2):
    worksheet3.write(idRow, 0, name, wellID_format)
    logging.info('"%s" written to All NT Seq worksheet.', name)
# Columns 1 onwards: nucleotide sequences, one base per cell.
worksheet3.write(0, 4, 'Nucleotide Sequence', title_format)
for seqRow, nt in enumerate(ntList, start=2):
    for seqCol, letter in enumerate(nt, start=1):
        worksheet3.write(seqRow, seqCol, letter, sequence_format)
logging.info('Nucleotide sequences written to All NT Seq worksheet.')
# Column after the alignment: normalised ELISA absorbances.
absCol = alignLen2 + 1
worksheet3.write(0, absCol, 'Normalized Absorbance', title_format)
for absRow, result in enumerate(reducedRelAveListNt, start=2):
    worksheet3.write(absRow, absCol, result, stats_format)
    logging.info('"%s" written to All NT Seq worksheet.', result)
# Row 1: nucleotide base pair numbers above the sequences.
numberList2 = list(range(1, alignLen2 + 1))
for bpCol, number in enumerate(numberList2, start=1):
    worksheet3.write(1, bpCol, number, residue_format)

##################
# 'Unique NT Seq' worksheet.
##################
# Create worksheet for unique nucleotide sequences.
worksheet4 = workbook.add_worksheet('Unique NT Seq')
worksheet4.hide_gridlines(option=2)
worksheet4.set_column(1, alignLen2, 3)
worksheet4.freeze_panes(0, 1)
logging.info('Unique NT Seq worksheet created.')
# Unique nucleotide sequences with their counts, most frequent first.
unique = OrderedCounter(ntList).most_common()
uniqueDict = dict(unique)
# Columns 1 onwards: unique sequences, one base per cell.
worksheet4.write(0, 4, 'Nucleotide Sequence', title_format)
for uniqueRow, seq in enumerate(uniqueDict, start=2):
    for uniqueCol, letter in enumerate(seq, start=1):
        worksheet4.write(uniqueRow, uniqueCol, letter, sequence_format)
logging.info('Unique sequences written to Unique NT Seq worksheet.')
# Column after the alignment: number of occurrences of each unique sequence.
countCol = alignLen2 + 1
worksheet4.write(0, countCol, 'Count', title_format)
for countRow, number in enumerate(uniqueDict.values(), start=2):
    worksheet4.write_number(countRow, countCol, number, general_format)
# Row 1: nucleotide base pair numbers above the sequences.
for bpCol, number in enumerate(numberList2, start=1):
    worksheet4.write(1, bpCol, number, residue_format)

##################
# Ordered values: Nucleotides
##################

# Get ordered list of wells that correspond to unique sequences; necessary for subsequent statistics.
orderedSeq = list(uniqueDict)
# A well belongs to a unique sequence only when its sequence is identical. A substring
# test would also assign wells whose longer sequence merely contains the shorter one.
orderedIndex = [wellName
                for seq in orderedSeq
                for wellName, wellSeq in newntDict.items()
                if wellSeq == seq]
# Get ordered values corresponding to ordered list of wells for unique sequences; necessary for subsequent statistics.
# The leading zero is a placeholder so that the statistics can start slicing at index 1.
orderedScores = [0] + [aveCellsDictNt[wellName] for wellName in orderedIndex if wellName in aveCellsDictNt]

##################
# Statistics: Nucleotides
##################

# Note: See same notes as for 'Statistics: Amino Acids'.
# Retrieve max, min, median, mean and standard deviation of the absorbances for each
# unique nucleotide sequence. orderedScores holds a placeholder at index 0 followed by
# the scores grouped per unique sequence, so each group is the slice [begin:end].
uniqueMaxList = []
uniqueMinList = []
uniqueMedianList = []
uniqueMeanList = []
uniqueStdevList = []
begin = 1
for seq, count in uniqueDict.items():
    end = begin + int(count)
    groupScores = orderedScores[begin:end]
    if not groupScores:
        # Happens when the sequence counts add up to more than the number of scored wells.
        raise ValueError('No ELISA scores found for a unique nucleotide sequence; '
                         'sequence counts do not match the scored wells. '
                         'Check the sequencing file for repeated sequence names.')
    uniqueMaxList.append(max(groupScores))
    uniqueMinList.append(min(groupScores))
    uniqueMedianList.append(statistics.median(groupScores))
    uniqueMeanList.append(statistics.mean(groupScores))
    # The standard deviation needs at least two values; report 0 for a single well.
    try:
        uniqueStdevList.append(statistics.stdev(groupScores))
    except statistics.StatisticsError:
        uniqueStdevList.append(0)
    begin = end

# Write statistics to Unique NT Seq worksheet.
# One column per statistic, starting two columns after the alignment; titles in row 0, values from row 2.
statsColumns = [('Max.', uniqueMaxList),
                ('Min.', uniqueMinList),
                ('Median', uniqueMedianList),
                ('Mean', uniqueMeanList),
                ('St. Dev.', uniqueStdevList)]
for statsCol, (statsTitle, statsValues) in enumerate(statsColumns, start=alignLen2 + 2):
    worksheet4.write(0, statsCol, statsTitle, title_format)
    for statsRow, statsValue in enumerate(statsValues, start=2):
        worksheet4.write(statsRow, statsCol, statsValue, stats_format)

# Associate specific well IDs with corresponding unique sequence (nucleotides).
# orderedIndex is grouped per unique sequence, so consecutive slices of length `count` give each group.
countID = []
begin = 0
for count in uniqueDict.values():
    end = begin + int(count)
    countID.append(orderedIndex[begin:end])
    begin = end
# Write IDs to worksheet as a comma-separated list per unique sequence.
wellCol = alignLen2 + 7
worksheet4.write(0, wellCol, 'Wells', title_format)
countIDregex = re.compile(r"([A-Z][0-1][0-9])")
sep = ', '
for wellRow, groupNames in enumerate(countID, start=2):
    groupWells = countIDregex.findall(str(groupNames))
    worksheet4.write(wellRow, wellCol, sep.join(groupWells), wellList_format)

# Assign arbitrary IDs (1, 2, 3, ...) to each unique nucleotide sequence.
worksheet4.write(0, 0, 'ID', title_format)
numberList = list(range(1, len(uniqueDict) + 1))
for idRow, number in enumerate(numberList, start=2):
    worksheet4.write_number(idRow, 0, number, general_format)

##################
# Final workbook formatting.
##################

# Conditionally format statistics columns.
# Row counts are recomputed per worksheet: the shared `unique` variable holds the nucleotide
# list by this point, so it must not be used to size the amino acid worksheet.
uniqueAaCount = len(OrderedCounter(aaList))
uniqueNtCount = len(OrderedCounter(ntList))
colourScale = {'type': '2_color_scale', 'min_color': '#FAFAFA', 'max_color': '#008000'}
# A fresh copy of the options is passed to every call so no call can affect another.
worksheet1.conditional_format(1, alignLen + 1, len(newNameListAa) + 1, alignLen + 1, dict(colourScale))
worksheet2.conditional_format(1, alignLen + 2, uniqueAaCount + 1, alignLen + 6, dict(colourScale))
worksheet3.conditional_format(1, alignLen2 + 1, len(newNameListNt) + 1, alignLen2 + 1, dict(colourScale))
worksheet4.conditional_format(1, alignLen2 + 2, uniqueNtCount + 1, alignLen2 + 6, dict(colourScale))

# Transform data into proper Excel-formatted tables without any design style applied.
tableOptions = {'header_row': False, 'style': None}
worksheet1.add_table(1, 0, len(newNameListAa) + 1, alignLen + 1, dict(tableOptions))
worksheet2.add_table(1, 0, uniqueAaCount + 1, alignLen + 7, dict(tableOptions))
worksheet3.add_table(1, 0, len(nameListNt) + 1, alignLen2 + 1, dict(tableOptions))
worksheet4.add_table(1, 0, uniqueNtCount + 1, alignLen2 + 7, dict(tableOptions))

# Close .xlsx file.
workbook.close()

# Conclusion: report the output file on screen and in the log.
outputMessage = 'Excel sequence alignment with ELISA absorbances saved as %s_analysed.xlsx.' % (elisaFileShort)
print(green('\n' + outputMessage))
logging.info('Excel file exported as %s_analysed.xlsx.', elisaFileShort)
print(green('\nAnalysis finished. See log file for details.'))
logging.info('phageDisplayElisaAnalysis.py finished running.')

# Shutdown logging.
logging.shutdown()